Load packages

rm(list = ls())

library(tictoc)  # to monitor time

library(raster) # read raster files
library(rgdal) # Use GDAL functions
library(exactextractr) # Zonal statistics from raster
library(sf) # Working with shapefile layers

library(tidyverse) # data wrangling
library(plyr) # data wrangling
library(dplyr) # data wrangling

library(ggpmisc) # To place geom_text within plot
# library(ggmap) # Plot leaflet
# library(ggplot2) # Visualizations
library(plotly) # Interactive visualization

print("Packages loaded successfully!")
## [1] "Packages loaded successfully!"

Functions specific to this RMD

#########################################
# Creating function for exponential fit
myexpfit <- function(response_var, pred_var, mydata){
  print(paste0("Model = ", "log(",response_var, ")~", pred_var))
  model.linear <- lm(paste0("log(",response_var, ")~", pred_var), data = mydata)
   a1 <- exp(model.linear$coefficients[1])
   b1 <- (model.linear$coefficients[2])
   R2 <- summary(model.linear)$r.squared

   myresults <- c(a1, b1, R2)
   return(myresults)
}
#########################################

Load dataset

# dat <- read.csv("deep-learning-all-features.csv", header = TRUE)
# save(dat, file = "Quantix_deep_learning.RData")

load("Quantix_deep_learning.RData", verbose = TRUE)
## Loading objects:
##   dat

Data preprocessing

Convert a set of columns from numeric to factors

# Convert a set of columns from numeric to factors

fcols <- c("Fieldname", "FlightDate", "Week",  "Type", "N", "Strip", "Mgmt_zone", 
           "DEM_type_Flat_Area", "DEM_type_Lower_Slope", "DEM_type_Middle_Slope", 
           "DEM_type_Ridge", "DEM_type_Upper_Slope", "DEM_type_Valley")
dat[fcols] <- lapply(dat[fcols], factor)

str(dat)
## 'data.frame':    1270534 obs. of  46 variables:
##  $ Fieldname            : Factor w/ 8 levels "DMD_GH1","PSF_111",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ FlightDate           : Factor w/ 22 levels "2019/06/07","2019/06/09",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ DOY                  : int  158 158 158 158 158 158 158 158 158 158 ...
##  $ Week                 : Factor w/ 14 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Longitude            : num  43 43 43 43 43 ...
##  $ Latitude             : num  -76.6 -76.6 -76.6 -76.6 -76.6 ...
##  $ Yield                : num  182 188 191 194 197 ...
##  $ Yield_Mg.ha          : num  11.4 11.8 12 12.2 12.4 ...
##  $ Type                 : Factor w/ 2 levels "grain","silage": 1 1 1 1 1 1 1 1 1 1 ...
##  $ N                    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Strip                : Factor w/ 6 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ rgn_red              : num  18 18 18 18 18 18 18 18 18 18 ...
##  $ rgn_green            : num  16 15 15 15 15 15 15 15 15 16 ...
##  $ rgn_nir              : num  36 36 35 35 35 35 35 35 35 35 ...
##  $ rgb_red              : num  115 115 116 117 118 118 119 118 118 119 ...
##  $ rgb_green            : num  89 89 90 90 91 91 92 91 91 92 ...
##  $ rgb_blue             : num  67 67 68 68 69 69 70 69 69 70 ...
##  $ NDVI                 : num  0.333 0.333 0.321 0.321 0.321 ...
##  $ GNDVI                : num  0.385 0.412 0.4 0.4 0.4 ...
##  $ EVI2                 : num  0.597 0.616 0.59 0.59 0.59 ...
##  $ SR                   : num  2 2 1.94 1.94 1.94 ...
##  $ EXG                  : num  -4 -4 -4 -5 -5 -5 -5 -5 -5 -5 ...
##  $ TGI                  : num  3.28 3.28 3.28 2.89 2.89 2.89 2.89 2.89 2.89 2.89 ...
##  $ Mgmt_zone            : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 3 3 2 2 2 2 ...
##  $ Climod_GDD           : int  10 10 10 10 10 10 10 10 10 10 ...
##  $ Climod_Temp_C        : num  15.6 15.6 15.6 15.6 15.6 ...
##  $ Climod_Precip_mm     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ACIS_GDD             : int  10 10 10 10 10 10 10 10 10 10 ...
##  $ ACIS_Temp_C          : num  15.6 15.6 15.6 15.6 15.6 ...
##  $ ACIS_Precip_mm       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ NASA_PAR             : num  159 159 159 159 159 ...
##  $ NASA_Temp_C          : num  17.3 17.3 17.3 17.3 17.3 ...
##  $ NASA_SH              : num  7.2 7.2 7.2 7.2 7.2 7.2 7.2 7.2 7.2 7.2 ...
##  $ NASA_RH              : num  59.7 59.7 59.7 59.7 59.7 ...
##  $ NASA_Precip_mm       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ NASA_WS              : num  0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 ...
##  $ NASA_SurfWet         : num  0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 ...
##  $ NASA_RootWet         : num  0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 ...
##  $ NASA_ProfWet         : num  0.66 0.66 0.66 0.66 0.66 0.66 0.66 0.66 0.66 0.66 ...
##  $ DEM                  : int  125 125 125 125 125 125 125 125 125 125 ...
##  $ DEM_type_Flat_Area   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ DEM_type_Lower_Slope : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 2 2 ...
##  $ DEM_type_Middle_Slope: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ DEM_type_Ridge       : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 1 1 1 ...
##  $ DEM_type_Upper_Slope : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ DEM_type_Valley      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

Check for missing values

# Check if any columns have missing values 
sapply(dat, anyNA)
##             Fieldname            FlightDate                   DOY 
##                 FALSE                 FALSE                 FALSE 
##                  Week             Longitude              Latitude 
##                 FALSE                 FALSE                 FALSE 
##                 Yield           Yield_Mg.ha                  Type 
##                 FALSE                 FALSE                 FALSE 
##                     N                 Strip               rgn_red 
##                  TRUE                  TRUE                 FALSE 
##             rgn_green               rgn_nir               rgb_red 
##                 FALSE                 FALSE                 FALSE 
##             rgb_green              rgb_blue                  NDVI 
##                 FALSE                 FALSE                 FALSE 
##                 GNDVI                  EVI2                    SR 
##                 FALSE                 FALSE                 FALSE 
##                   EXG                   TGI             Mgmt_zone 
##                 FALSE                 FALSE                  TRUE 
##            Climod_GDD         Climod_Temp_C      Climod_Precip_mm 
##                 FALSE                 FALSE                 FALSE 
##              ACIS_GDD           ACIS_Temp_C        ACIS_Precip_mm 
##                 FALSE                 FALSE                 FALSE 
##              NASA_PAR           NASA_Temp_C               NASA_SH 
##                 FALSE                 FALSE                 FALSE 
##               NASA_RH        NASA_Precip_mm               NASA_WS 
##                 FALSE                 FALSE                 FALSE 
##          NASA_SurfWet          NASA_RootWet          NASA_ProfWet 
##                 FALSE                 FALSE                 FALSE 
##                   DEM    DEM_type_Flat_Area  DEM_type_Lower_Slope 
##                 FALSE                 FALSE                 FALSE 
## DEM_type_Middle_Slope        DEM_type_Ridge  DEM_type_Upper_Slope 
##                 FALSE                 FALSE                 FALSE 
##       DEM_type_Valley 
##                 FALSE
# Names of columns that has missing values
names(dat)[sapply(dat, anyNA)]
## [1] "N"         "Strip"     "Mgmt_zone"

Plot all yield data

# "DMD_GH1" "PSF_111" "PSF_12"  "SLS_ABH" "SLS_NS"  "SSF_121" "SSF_202" "SSF_66"
selfield <- "SSF_121"

sub_dat <- dat %>%
  dplyr::filter(Fieldname == selfield) %>% 
  droplevels()

# Legend labeling - Splitting the yield range into equal classes
minyld <- round(min(sub_dat$Yield_Mg.ha),0) # minimum yield 
maxyld <- round(max(sub_dat$Yield_Mg.ha),0) # maximum yield
Nclasses <- 4      # Number of yield classes to split from the whole yield range

mybreaks <- round(seq(minyld, maxyld, len = (Nclasses+1)),0)
YE_colors <-  c("blue","cyan","lightgreen","yellow","orange", "red")

plt <- ggplot(sub_dat, aes(x = Latitude, y = Longitude)) +
  geom_point(aes(color = Yield_Mg.ha), size = 0.7) +    
      theme_bw() +
      # facet_grid(Field~Year, scales = "free") +
      theme(aspect.ratio = 1.3) +
      # ggtitle("Corn silage fields")+ 
      # theme(legend.position = c(0.85, 0.27))+
      scale_colour_gradientn(colours = YE_colors,  
                             name = "Yield (Mg/ha)",
                             limits=c(minyld,maxyld),
                             breaks = mybreaks,
                             labels = round(mybreaks, 0)) +
    guides(colour = guide_colourbar(barheight = unit(1.8, "in")))+
    theme(panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(), 
            axis.title.x = element_blank(), 
            axis.title.y = element_blank(),
            axis.text.x = element_blank(), 
            axis.text.y = element_blank(), 
            axis.ticks = element_blank(), 
            plot.title = element_text(size=14, face = "bold")) +
      theme(legend.title = element_text(face = "bold", size=8, margin = margin(0,0,7,0)),
            legend.text = element_text(vjust = 0.5),
            strip.text = element_text(face = "bold", size = 10))
      
plt

# png_save_name <- paste(selfield, "_", "allYear", ".png", sep = "")
# 
# wd <- 8.5
# ht <- 3 
# ggsave(png_save_name, width = wd, height = ht, unit = "in")

Exploratory data analysis

# Violin plot

vplt <- ggplot(dat, aes(x = Fieldname, y = Yield_Mg.ha, fill = Fieldname)) +
  geom_violin() +
  theme_bw() +
  facet_wrap(~Type, nrow = 2, scales = "free_y")

vplt

Weather vs NDVI

# "DMD_GH1" "PSF_111" "PSF_12"  "SLS_ABH" "SLS_NS"  "SSF_121" "SSF_202" "SSF_66"
# "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14"
# Best grain field - DMD_GH1
# Best sialge filed - SSF_121, PSF_12

selfield <- "DMD_GH1"
# seltype <- "silage"

sub_dat <- dat %>% 
  filter(Fieldname == selfield) %>% 
  droplevels()

mu <- ddply(sub_dat, "Week", summarise, ndvi.mean = mean(NDVI), weather.mean = mean(NASA_PAR))

plt1 <- ggplot(mu, aes(x = weather.mean, y = ndvi.mean)) + 
  # geom_boxplot() + 
  geom_point(size = 2.5) + 
  geom_line() + 
  theme_bw() + 
  xlab("NASA_PAR") + 
  ylab("NDVI")

plt1

Subset only N strip data

ndat <- dat %>% 
  filter(!is.na(N))

names(ndat)[sapply(ndat, anyNA)]
## [1] "Mgmt_zone"

Subset results for one field

# "DMD_GH1" "PSF_111" "PSF_12"  "SLS_ABH" "SLS_NS"  "SSF_121" "SSF_202" "SSF_66"
# "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14"
# Best grain field - DMD_GH1
# Best sialge filed - SSF_121, PSF_12

selfield <- "SSF_121"
# seltype <- "silage"

sub_dat <- ndat %>% 
  filter(Fieldname == selfield) %>% 
  droplevels()

# "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14"
lvl <- levels(sub_dat$Week)
sub_dat$Strip <- as.numeric(sub_dat$Strip)

r2df <- data.frame()
for (i in 1:length(lvl)){
  print(i)
  wsub_dat <- sub_dat %>% 
    filter(Strip <= 4) %>%
    filter(Week == lvl[i]) %>%
    droplevels()
  mod <- myexpfit("Yield_Mg.ha", "NDVI", wsub_dat)
  r2 <- c(lvl[i], mod[1], mod[2], mod[3])
  r2df <- rbind(r2df, r2)
  print(paste0(i, " = ", mod[3]))
}
## [1] 1
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "1 = 0.108986777872688"
## [1] 2
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "2 = 0.033914704266872"
## [1] 3
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "3 = 0.0897048512691024"
## [1] 4
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "4 = 0.0353783962271664"
## [1] 5
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "5 = 0.254804905319651"
## [1] 6
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "6 = 0.532648000702073"
## [1] 7
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "7 = 0.446660462128851"
## [1] 8
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "8 = 0.362433519079672"
## [1] 9
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "9 = 0.359349081579489"
## [1] 10
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "10 = 0.201208422613502"
## [1] 11
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "11 = 0.267718102984678"
## [1] 12
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "12 = 0.243352476662727"
## [1] 13
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "13 = 0.302828726265256"
## [1] 14
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "14 = 0.0905284550943136"
colnames(r2df) <- c("Weeks", "a", "b", "R2")

points_sf <- st_as_sf(sub_dat, coords = c("Longitude", "Latitude"), crs = 32618, remove = FALSE)
plot(points_sf["Yield_Mg.ha"])

numeric_cols <- c("Weeks", "a", "b", "R2")
r2df[numeric_cols] <- lapply(r2df[numeric_cols], as.numeric)

r2plot <- ggplot(r2df, aes(x = Weeks, y = R2)) +
  geom_point(size = 2) +
  geom_line() +
  theme_bw() + 
  scale_y_continuous(breaks = seq(0,1.0, 0.1)) +
  scale_x_continuous(breaks = seq(1,15, 1))

# r2plot

ggplotly(r2plot)

Pick the best week

best_week <- which.max(r2df$R2)

sub_dat <- dat %>% 
  filter(Week == best_week & Fieldname == selfield)

a_coeff <- r2df[best_week, 2]
b_coeff <- r2df[best_week, 3]

sub_dat$Pred_Yield <-  a_coeff*exp(b_coeff*sub_dat$NDVI)

sub_dat$error <- (sub_dat$Pred_Yield - sub_dat$Yield_Mg.ha)
mse <- mean(sub_dat$error^2)
rmse <- sqrt(mse)
cat("RMSE for", selfield, "on week", best_week, "is", rmse, "Mg/ha")
## RMSE for SSF_121 on week 6 is 7.154346 Mg/ha

Save actual and predicted yield into a CSV

sel_dat <- sub_dat %>% 
  select(c(Longitude, Latitude, Pred_Yield, Yield_Mg.ha))

colnames(sel_dat) <- c("Longitude", "Latitude", "Predicted_yield", "Actual_yield")

field_save_name <- paste0(selfield, "_Week", best_week, ".csv")

# write.csv(sel_dat, field_save_name, row.names = FALSE)